{
 "metadata": {
  "name": "",
  "signature": "sha256:a1bb315e8952cfd2c59dfda023d5728aa89fb7edb1281e96e7b7ce850491395e"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Text Mining with sklearn\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import matplotlib.pyplot as plt\n",
      "from sklearn.datasets import load_files\n",
      "from sklearn import datasets\n",
      "import numpy as np\n",
      "%matplotlib inline\n",
      "\n",
      "\n",
      "\n",
      "# Notebook-wide matplotlib defaults: figure size (width, height) in inches,\n",
      "# grid lines drawn on every axes, and gray as the default colormap.\n",
      "plt.rcParams['axes.grid'] = True\n",
      "plt.rcParams['figure.figsize'] = (10, 7.2)\n",
      "plt.gray()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Load the train and test splits of the 20 Newsgroups corpus from disk,\n",
      "# restricted to the newsgroups listed in `categories`. Passing an encoding\n",
      "# makes load_files decode the raw files to text.\n",
      "categories = [\n",
      "    'alt.atheism',\n",
      "    'sci.space',\n",
      "    'talk.religion.misc',\n",
      "    'comp.graphics',\n",
      "]\n",
      "\n",
      "Train_subset = load_files(\n",
      "    'datasets/20news-bydate/20news-bydate-train',\n",
      "    categories=categories,\n",
      "    encoding='latin -1',\n",
      ")\n",
      "Test_subset = load_files(\n",
      "    'datasets/20news-bydate/20news-bydate-test',\n",
      "    categories=categories,\n",
      "    encoding='latin -1',\n",
      ")\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Vectorize the training data: learn a TF-IDF vocabulary from the training\n",
      "# documents, ignoring terms that occur in fewer than 2 documents (min_df=2).\n",
      "from sklearn.feature_extraction.text import TfidfVectorizer\n",
      "\n",
      "Vectorizer = TfidfVectorizer(min_df=2)\n",
      "X_train = Vectorizer.fit_transform(Train_subset.data)\n",
      "y_train = Train_subset.target\n",
      "\n",
      "# Sanity check: the feature matrix has one row per label. print is called with\n",
      "# parentheses so the cell runs under both Python 2 and Python 3.\n",
      "print(X_train.shape[0] == y_train.shape[0])\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Fit a multinomial Naive Bayes classifier on the TF-IDF features and report\n",
      "# its accuracy on the same data it was trained on.\n",
      "from sklearn.naive_bayes import MultinomialNB\n",
      "\n",
      "clf = MultinomialNB(alpha=1)\n",
      "clf.fit(X_train, y_train)\n",
      "\n",
      "train_accuracy = clf.score(X_train, y_train)\n",
      "print('Train score:{0:.1f}%'.format(train_accuracy * 100))\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Score the classifier on the held-out split. The test documents are passed\n",
      "# through the already-fitted vectorizer (transform only, no refit).\n",
      "# Original author's note on the result: slight over fitting.\n",
      "y_test = Test_subset.target\n",
      "X_test = Vectorizer.transform(Test_subset.data)\n",
      "\n",
      "test_accuracy = clf.score(X_test, y_test)\n",
      "print('Test score:{0:.1f}%'.format(test_accuracy * 100))\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Dimensionality Reduction and Visualization\n",
      "#\n",
      "# Project the TF-IDF training matrix onto two components and draw one scatter\n",
      "# series per newsgroup. TfidfVectorizer returns a sparse matrix, so TruncatedSVD\n",
      "# is used here: it accepts sparse input directly, whereas RandomizedPCA was\n",
      "# deprecated and later removed from scikit-learn.\n",
      "\n",
      "from sklearn.decomposition import TruncatedSVD\n",
      "from itertools import cycle\n",
      "\n",
      "# random_state pins the randomized solver so the figure is reproducible.\n",
      "X_train_pca = TruncatedSVD(n_components=2, random_state=0).fit_transform(X_train)\n",
      "target_names = Train_subset.target_names\n",
      "colors = ['r','g','b','y','m','c']\n",
      "\n",
      "# cycle() reuses colors when there are more classes than palette entries;\n",
      "# a plain zip over `colors` would silently drop the extra classes.\n",
      "for i, c in zip(sorted(set(y_train)), cycle(colors)):\n",
      "    mask = y_train == i\n",
      "    plt.scatter(X_train_pca[mask, 0], X_train_pca[mask, 1], c=c, label=target_names[i])\n",
      "plt.legend(loc='best')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Model Performance: per-class precision, recall and F1 on the test split.\n",
      "from sklearn.metrics import classification_report,confusion_matrix\n",
      "\n",
      "# Label names are read from the test bunch itself, so this cell does not rely\n",
      "# on the visualization cell having defined `target_names`. print is called with\n",
      "# parentheses so the cell runs under both Python 2 and Python 3.\n",
      "y_pred = clf.predict(X_test)\n",
      "print(classification_report(Test_subset.target, y_pred, target_names=Test_subset.target_names))\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}